evals = read.csv("evals.csv")
head(evals)
## score rank ethnicity gender language age cls_perc_eval
## 1 4.7 tenure track minority female english 36 55.81395
## 2 4.1 tenure track minority female english 36 68.80000
## 3 3.9 tenure track minority female english 36 60.80000
## 4 4.8 tenure track minority female english 36 62.60163
## 5 4.6 tenured not minority male english 59 85.00000
## 6 4.3 tenured not minority male english 59 87.50000
## cls_did_eval cls_students cls_level cls_profs cls_credits bty_f1lower
## 1 24 43 upper single multi credit 5
## 2 86 125 upper single multi credit 5
## 3 76 125 upper single multi credit 5
## 4 77 123 upper single multi credit 5
## 5 17 20 upper multiple multi credit 4
## 6 35 40 upper multiple multi credit 4
## bty_f1upper bty_f2upper bty_m1lower bty_m1upper bty_m2upper bty_avg
## 1 7 6 2 4 6 5
## 2 7 6 2 4 6 5
## 3 7 6 2 4 6 5
## 4 7 6 2 4 6 5
## 5 4 2 2 3 3 3
## 6 4 2 2 3 3 3
## pic_outfit pic_color
## 1 not formal color
## 2 not formal color
## 3 not formal color
## 4 not formal color
## 5 not formal color
## 6 not formal color
summary(evals)
## score rank ethnicity gender
## Min. :2.300 teaching :102 minority : 64 female:195
## 1st Qu.:3.800 tenured :253 not minority:399 male :268
## Median :4.300 tenure track:108
## Mean :4.175
## 3rd Qu.:4.600
## Max. :5.000
## language age cls_perc_eval cls_did_eval
## english :435 Min. :29.00 Min. : 10.42 Min. : 5.00
## non-english: 28 1st Qu.:42.00 1st Qu.: 62.70 1st Qu.: 15.00
## Median :48.00 Median : 76.92 Median : 23.00
## Mean :48.37 Mean : 74.43 Mean : 36.62
## 3rd Qu.:57.00 3rd Qu.: 87.25 3rd Qu.: 40.00
## Max. :73.00 Max. :100.00 Max. :380.00
## cls_students cls_level cls_profs cls_credits
## Min. : 8.00 lower:157 multiple:306 multi credit:436
## 1st Qu.: 19.00 upper:306 single :157 one credit : 27
## Median : 29.00
## Mean : 55.18
## 3rd Qu.: 60.00
## Max. :581.00
## bty_f1lower bty_f1upper bty_f2upper bty_m1lower
## Min. :1.000 Min. :1.000 Min. : 1.000 Min. :1.000
## 1st Qu.:2.000 1st Qu.:4.000 1st Qu.: 4.000 1st Qu.:2.000
## Median :4.000 Median :5.000 Median : 5.000 Median :3.000
## Mean :3.963 Mean :5.019 Mean : 5.214 Mean :3.413
## 3rd Qu.:5.000 3rd Qu.:7.000 3rd Qu.: 6.000 3rd Qu.:5.000
## Max. :8.000 Max. :9.000 Max. :10.000 Max. :7.000
## bty_m1upper bty_m2upper bty_avg pic_outfit
## Min. :1.000 Min. :1.000 Min. :1.667 formal : 77
## 1st Qu.:3.000 1st Qu.:4.000 1st Qu.:3.167 not formal:386
## Median :4.000 Median :5.000 Median :4.333
## Mean :4.147 Mean :4.752 Mean :4.418
## 3rd Qu.:5.000 3rd Qu.:6.000 3rd Qu.:5.500
## Max. :9.000 Max. :9.000 Max. :8.167
## pic_color
## black&white: 78
## color :385
##
##
##
##
Exercise 1:
This is an observational study.
Exercise 2:
hist(evals$score, main="Course Score", xlab="Score", col="grey", freq=FALSE)
qqnorm(evals$score)
#The distribution of score is skewed left, otherwise would be normal. This tells us that students tend to rate courses higher than the middle of the scale on average, reserving lower scores for serious conditions. This is an expected distribution for subjective scores or ratings.
Exercise 3:
plot(evals$rank, evals$score, main="Y vs Predictor", ylab="Score", xlab="Professor Rank", col="blue")
hist(evals$cls_perc_eval, main="Percentage of students who completed eval", xlab="Percent", col="grey", freq=FALSE)
plot(evals$cls_perc_eval, evals$score, main="Y vs Predictor", ylab="Score", xlab="Percent of students who completed eval", col="blue")
#Professor rank seems to have an impact on course score, specifically teaching professors seem to perform better.
#Percentage of students who completed evaluations also seems to correlate with the score, with higher-percentage evals leading to higher overall course score
plot(evals$score ~ evals$bty_avg)
Exercise 4:
plot(evals$score ~ jitter(rep(evals$bty_avg)))
#The original plot hid values as there were many points laid on top of one another
Exercise 5:
m_bty = lm(score ~ bty_avg, data=evals)
summary(m_bty)
##
## Call:
## lm(formula = score ~ bty_avg, data = evals)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.9246 -0.3690 0.1420 0.3977 0.9309
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.88034 0.07614 50.96 < 2e-16 ***
## bty_avg 0.06664 0.01629 4.09 5.08e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5348 on 461 degrees of freedom
## Multiple R-squared: 0.03502, Adjusted R-squared: 0.03293
## F-statistic: 16.73 on 1 and 461 DF, p-value: 5.083e-05
plot(score ~ bty_avg, data=evals)
abline(m_bty)
# equation: bty = 0.06664 * bty_avg + 388034
# Based solely on the p-value, average beauty seems to be a statistically significant predictor
Exercise 6:
plot(m_bty)
# Based on the diagnostic plots, especially the normal Q-Q plot, the model does not seem to be a good fit.
Exercise 7:
plot(evals$bty_avg ~ evals$bty_f1lower)
cor(evals$bty_avg, evals$bty_f1lower)
## [1] 0.8439112
plot(evals[,13:19])
m_bty_gen <- lm(score ~ bty_avg + gender, data = evals)
summary(m_bty_gen)
##
## Call:
## lm(formula = score ~ bty_avg + gender, data = evals)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.8305 -0.3625 0.1055 0.4213 0.9314
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.74734 0.08466 44.266 < 2e-16 ***
## bty_avg 0.07416 0.01625 4.563 6.48e-06 ***
## gendermale 0.17239 0.05022 3.433 0.000652 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5287 on 460 degrees of freedom
## Multiple R-squared: 0.05912, Adjusted R-squared: 0.05503
## F-statistic: 14.45 on 2 and 460 DF, p-value: 8.177e-07
plot(m_bty_gen)
Exercise 8:
# Again, based on p-value, bty_average seems to be a significant predictor of score but the linear model again seems to be non-normal
Exercise 9:
# plot score versus bty_avg and include the regression lines for males and females on the same plot (in different colors!)
plot(score ~ bty_avg, data=evals)
abline(a = 3.74734, b = .07416, col="pink")
abline(a = (3.74734 + .17239), b = .07416, col="blue")
# The equation is score = 0.07416 * bty_avg + 0.17239 + 3.374734
# Males seem to give higher scores
Exercise 10:
m_bty_rank <- lm(score ~ bty_avg + rank, data = evals)
summary(m_bty_rank)
##
## Call:
## lm(formula = score ~ bty_avg + rank, data = evals)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.8713 -0.3642 0.1489 0.4103 0.9525
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.98155 0.09078 43.860 < 2e-16 ***
## bty_avg 0.06783 0.01655 4.098 4.92e-05 ***
## ranktenured -0.12623 0.06266 -2.014 0.0445 *
## ranktenure track -0.16070 0.07395 -2.173 0.0303 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5328 on 459 degrees of freedom
## Multiple R-squared: 0.04652, Adjusted R-squared: 0.04029
## F-statistic: 7.465 on 3 and 459 DF, p-value: 6.88e-05
plot(m_bty_rank)
# R creates (n-1) predictors for n categorical variables
Exercise 11:
# I would expect the number of professors to have a high p-value
Exercise 12:
m_full <- lm(score ~ rank + ethnicity + gender + language + age + cls_perc_eval + cls_students + cls_level + cls_profs + cls_credits + bty_avg + pic_outfit + pic_color, data = evals)
summary(m_full)
##
## Call:
## lm(formula = score ~ rank + ethnicity + gender + language + age +
## cls_perc_eval + cls_students + cls_level + cls_profs + cls_credits +
## bty_avg + pic_outfit + pic_color, data = evals)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.77397 -0.32432 0.09067 0.35183 0.95036
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.0952141 0.2905277 14.096 < 2e-16 ***
## ranktenured -0.0973378 0.0663296 -1.467 0.14295
## ranktenure track -0.1475932 0.0820671 -1.798 0.07278 .
## ethnicitynot minority 0.1234929 0.0786273 1.571 0.11698
## gendermale 0.2109481 0.0518230 4.071 5.54e-05 ***
## languagenon-english -0.2298112 0.1113754 -2.063 0.03965 *
## age -0.0090072 0.0031359 -2.872 0.00427 **
## cls_perc_eval 0.0053272 0.0015393 3.461 0.00059 ***
## cls_students 0.0004546 0.0003774 1.205 0.22896
## cls_levelupper 0.0605140 0.0575617 1.051 0.29369
## cls_profssingle -0.0146619 0.0519885 -0.282 0.77806
## cls_creditsone credit 0.5020432 0.1159388 4.330 1.84e-05 ***
## bty_avg 0.0400333 0.0175064 2.287 0.02267 *
## pic_outfitnot formal -0.1126817 0.0738800 -1.525 0.12792
## pic_colorcolor -0.2172630 0.0715021 -3.039 0.00252 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.498 on 448 degrees of freedom
## Multiple R-squared: 0.1871, Adjusted R-squared: 0.1617
## F-statistic: 7.366 on 14 and 448 DF, p-value: 6.552e-14
# The number of professors in a class has the highest p-value
Exercise 13:
# The coefficient for ethnicity states for professors who are not a minority, scores are generally 0.1234929 points higher than those of minority professors
Exercise 14:
m_partial <- lm(score ~ rank + ethnicity + gender + language + age + cls_perc_eval + cls_students + cls_level + cls_credits + bty_avg + pic_outfit + pic_color, data = evals)
summary(m_partial)
##
## Call:
## lm(formula = score ~ rank + ethnicity + gender + language + age +
## cls_perc_eval + cls_students + cls_level + cls_credits +
## bty_avg + pic_outfit + pic_color, data = evals)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.7836 -0.3257 0.0859 0.3513 0.9551
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.0872523 0.2888562 14.150 < 2e-16 ***
## ranktenured -0.0973829 0.0662614 -1.470 0.142349
## ranktenure track -0.1476746 0.0819824 -1.801 0.072327 .
## ethnicitynot minority 0.1274458 0.0772887 1.649 0.099856 .
## gendermale 0.2101231 0.0516873 4.065 5.66e-05 ***
## languagenon-english -0.2282894 0.1111305 -2.054 0.040530 *
## age -0.0089992 0.0031326 -2.873 0.004262 **
## cls_perc_eval 0.0052888 0.0015317 3.453 0.000607 ***
## cls_students 0.0004687 0.0003737 1.254 0.210384
## cls_levelupper 0.0606374 0.0575010 1.055 0.292200
## cls_creditsone credit 0.5061196 0.1149163 4.404 1.33e-05 ***
## bty_avg 0.0398629 0.0174780 2.281 0.023032 *
## pic_outfitnot formal -0.1083227 0.0721711 -1.501 0.134080
## pic_colorcolor -0.2190527 0.0711469 -3.079 0.002205 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4974 on 449 degrees of freedom
## Multiple R-squared: 0.187, Adjusted R-squared: 0.1634
## F-statistic: 7.943 on 13 and 449 DF, p-value: 2.336e-14
# The coefficients and significance did change. This indicates the dropped variable was not collinear with the other values.
Exercise 15:
m_partial <- lm(score ~ rank + ethnicity + gender + language + age + cls_perc_eval + cls_students + cls_credits + bty_avg + pic_outfit + pic_color, data = evals)
summary(m_partial)
##
## Call:
## lm(formula = score ~ rank + ethnicity + gender + language + age +
## cls_perc_eval + cls_students + cls_credits + bty_avg + pic_outfit +
## pic_color, data = evals)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.7761 -0.3187 0.0875 0.3547 0.9367
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.0856255 0.2888881 14.143 < 2e-16 ***
## ranktenured -0.0895940 0.0658566 -1.360 0.174372
## ranktenure track -0.1420696 0.0818201 -1.736 0.083184 .
## ethnicitynot minority 0.1424342 0.0759800 1.875 0.061491 .
## gendermale 0.2037722 0.0513416 3.969 8.40e-05 ***
## languagenon-english -0.2093185 0.1096785 -1.908 0.056966 .
## age -0.0087287 0.0031224 -2.795 0.005404 **
## cls_perc_eval 0.0053545 0.0015306 3.498 0.000515 ***
## cls_students 0.0003573 0.0003585 0.997 0.319451
## cls_creditsone credit 0.4733728 0.1106549 4.278 2.31e-05 ***
## bty_avg 0.0410340 0.0174449 2.352 0.019092 *
## pic_outfitnot formal -0.1172152 0.0716857 -1.635 0.102722
## pic_colorcolor -0.1973196 0.0681052 -2.897 0.003948 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4975 on 450 degrees of freedom
## Multiple R-squared: 0.185, Adjusted R-squared: 0.1632
## F-statistic: 8.51 on 12 and 450 DF, p-value: 1.275e-14
m_partial <- lm(score ~ rank + ethnicity + gender + language + age + cls_perc_eval + cls_credits + bty_avg + pic_outfit + pic_color, data = evals)
summary(m_partial)
##
## Call:
## lm(formula = score ~ rank + ethnicity + gender + language + age +
## cls_perc_eval + cls_credits + bty_avg + pic_outfit + pic_color,
## data = evals)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.78424 -0.31397 0.09261 0.35904 0.92154
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.152893 0.280892 14.785 < 2e-16 ***
## ranktenured -0.083092 0.065532 -1.268 0.205469
## ranktenure track -0.142239 0.081819 -1.738 0.082814 .
## ethnicitynot minority 0.143509 0.075972 1.889 0.059535 .
## gendermale 0.208080 0.051159 4.067 5.61e-05 ***
## languagenon-english -0.222515 0.108876 -2.044 0.041558 *
## age -0.009074 0.003103 -2.924 0.003629 **
## cls_perc_eval 0.004841 0.001441 3.359 0.000849 ***
## cls_creditsone credit 0.472669 0.110652 4.272 2.37e-05 ***
## bty_avg 0.043578 0.017257 2.525 0.011903 *
## pic_outfitnot formal -0.136594 0.068998 -1.980 0.048347 *
## pic_colorcolor -0.189905 0.067697 -2.805 0.005246 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4975 on 451 degrees of freedom
## Multiple R-squared: 0.1832, Adjusted R-squared: 0.1632
## F-statistic: 9.193 on 11 and 451 DF, p-value: 6.364e-15
m_partial_bad <- lm(score ~ ethnicity + gender + language + age + cls_perc_eval + cls_credits + bty_avg + pic_outfit + pic_color, data = evals)
summary(m_partial_bad)
##
## Call:
## lm(formula = score ~ ethnicity + gender + language + age + cls_perc_eval +
## cls_credits + bty_avg + pic_outfit + pic_color, data = evals)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.8455 -0.3221 0.1013 0.3745 0.9051
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.907030 0.244889 15.954 < 2e-16 ***
## ethnicitynot minority 0.163818 0.075158 2.180 0.029798 *
## gendermale 0.202597 0.050102 4.044 6.18e-05 ***
## languagenon-english -0.246683 0.106146 -2.324 0.020567 *
## age -0.006925 0.002658 -2.606 0.009475 **
## cls_perc_eval 0.004942 0.001442 3.427 0.000666 ***
## cls_creditsone credit 0.517205 0.104141 4.966 9.68e-07 ***
## bty_avg 0.046732 0.017091 2.734 0.006497 **
## pic_outfitnot formal -0.113939 0.067168 -1.696 0.090510 .
## pic_colorcolor -0.180870 0.067456 -2.681 0.007601 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4982 on 453 degrees of freedom
## Multiple R-squared: 0.1774, Adjusted R-squared: 0.161
## F-statistic: 10.85 on 9 and 453 DF, p-value: 2.441e-15
# m_partial is best model
Exercise 16:
plot(m_partial)
# q-q plot still indicates this is not a great linear, normal model
Exercise 17:
# Absolutely, professors who teach many courses can skew the data. This will lead to an inaccurate linear model
Exercise 18:
# A professor with a high eval score will be a teaching professor, non-minority, male, english speaker, young, have high evaluation rates, teach high-credit courses, be attractive, dress formally for pictures, and have their pictures be in color
Exercise 19:
# I would not. The model is a poor fit, the data is skewed by the methodology of data selection and organization, predictors such as info about photos and beauty ratings seem to have little predictive value, and the study was from an single university
Exercise 20:
# ELECTORAL COLLEGE!
Exercise 2:
interaction = lm(score ~ bty_avg + gender + bty_avg*gender, data = evals)
summary(interaction)
##
## Call:
## lm(formula = score ~ bty_avg + gender + bty_avg * gender, data = evals)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.8084 -0.3828 0.0903 0.4037 0.9211
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.95006 0.11800 33.475 <2e-16 ***
## bty_avg 0.03064 0.02400 1.277 0.2024
## gendermale -0.18351 0.15349 -1.196 0.2325
## bty_avg:gendermale 0.07962 0.03247 2.452 0.0146 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5258 on 459 degrees of freedom
## Multiple R-squared: 0.07129, Adjusted R-squared: 0.06522
## F-statistic: 11.74 on 3 and 459 DF, p-value: 1.997e-07
Exercise 3:
# function = 0.03064*bty_avg + 3.95006
Exercise 4:
# for men = 0.03064*bty_avg - 0.18351 + 3.95006
# for women = 0.03064*bty_avg + 0.18351 + 3.95006
plot(score ~ bty_avg, data=evals)
plot(score ~ bty_avg, data=evals)
abline(a = (3.74734 - .17239), b = .07416, col="pink")
abline(a = (3.74734 + .17239), b = .07416, col="blue")
Exercise 5:
library(leaps)
mod_sel <- regsubsets(score ~ rank + ethnicity + gender + language + age + cls_perc_eval + cls_students + cls_level + cls_profs + cls_credits + bty_avg + pic_outfit + pic_color, data = evals)
summary(mod_sel)
## Subset selection object
## Call: regsubsets.formula(score ~ rank + ethnicity + gender + language +
## age + cls_perc_eval + cls_students + cls_level + cls_profs +
## cls_credits + bty_avg + pic_outfit + pic_color, data = evals)
## 14 Variables (and intercept)
## Forced in Forced out
## ranktenured FALSE FALSE
## ranktenure track FALSE FALSE
## ethnicitynot minority FALSE FALSE
## gendermale FALSE FALSE
## languagenon-english FALSE FALSE
## age FALSE FALSE
## cls_perc_eval FALSE FALSE
## cls_students FALSE FALSE
## cls_levelupper FALSE FALSE
## cls_profssingle FALSE FALSE
## cls_creditsone credit FALSE FALSE
## bty_avg FALSE FALSE
## pic_outfitnot formal FALSE FALSE
## pic_colorcolor FALSE FALSE
## 1 subsets of each size up to 8
## Selection Algorithm: exhaustive
## ranktenured ranktenure track ethnicitynot minority gendermale
## 1 ( 1 ) " " " " " " " "
## 2 ( 1 ) " " " " " " " "
## 3 ( 1 ) " " " " " " "*"
## 4 ( 1 ) " " " " "*" " "
## 5 ( 1 ) " " " " "*" "*"
## 6 ( 1 ) " " " " "*" "*"
## 7 ( 1 ) " " " " "*" "*"
## 8 ( 1 ) " " " " "*" "*"
## languagenon-english age cls_perc_eval cls_students cls_levelupper
## 1 ( 1 ) " " " " " " " " " "
## 2 ( 1 ) " " " " " " " " " "
## 3 ( 1 ) " " " " " " " " " "
## 4 ( 1 ) " " " " "*" " " " "
## 5 ( 1 ) " " " " "*" " " " "
## 6 ( 1 ) " " " " "*" " " " "
## 7 ( 1 ) " " "*" "*" " " " "
## 8 ( 1 ) "*" "*" "*" " " " "
## cls_profssingle cls_creditsone credit bty_avg
## 1 ( 1 ) " " "*" " "
## 2 ( 1 ) " " "*" "*"
## 3 ( 1 ) " " "*" "*"
## 4 ( 1 ) " " "*" "*"
## 5 ( 1 ) " " "*" "*"
## 6 ( 1 ) " " "*" "*"
## 7 ( 1 ) " " "*" "*"
## 8 ( 1 ) " " "*" "*"
## pic_outfitnot formal pic_colorcolor
## 1 ( 1 ) " " " "
## 2 ( 1 ) " " " "
## 3 ( 1 ) " " " "
## 4 ( 1 ) " " " "
## 5 ( 1 ) " " " "
## 6 ( 1 ) " " "*"
## 7 ( 1 ) " " "*"
## 8 ( 1 ) " " "*"
summary(mod_sel)$adjr
## [1] 0.03993708 0.07975028 0.09974878 0.11853497 0.13661970 0.14409068
## [7] 0.15208877 0.15756485
plot(mod_sel)
# best model includes only cls_credits
Exercise 6:
# No, not the same. The adjusted R^2 for the latter model is much higher.